#2017 Analysis:

library(ggplot2)
library(corrplot)
## corrplot 0.92 loaded
library(magrittr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
nyc_df_17<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df_17 <- nyc_df_17 %>% filter(Year == 2017)

# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_17_top_10 <- nyc_df_17[order(-nyc_df_17$Total.Elevated.Blood.Levels), ]


top_10_counties <- head(nyc_df_17_top_10, 10)

library(ggplot2)

ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bottom 10 Counties (Not used just for reference)

nyc_df_17_bottom_10 <- nyc_df_17[order(nyc_df_17$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_17_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")

#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
  data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
  data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)

# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
  geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top") +
  scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
  scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'

# Plot the data points and the regression line
ggplot(nyc_df_17, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df_17)
# Plot the data points and the regression line
ggplot(nyc_df_17, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2017")
## `geom_smooth()` using formula = 'y ~ x'

# KMeans Clustering for EBLL and Income

set.seed(123) 
data_for_clustering <- nyc_df_17[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)


k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_17_copy<- nyc_df_17
nyc_df_17_copy$cluster <- as.factor(kmeans_model$cluster)



ggplot(nyc_df_17_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
  geom_point() +
  labs(title = "K-Means Clustering",
       x = "Number of Children with EBLLs",
       y = " Median Income ") +
  scale_color_discrete(name = "Cluster")

#rate of elev vs percent change 

# Children Tested vs Income and rate of elevation 
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )


ggplot(nyc_df_17, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme

# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

ggplot(nyc_df_17, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme +
  coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))

#2018 Analysis:

library(ggplot2)
library(corrplot)
library(magrittr)
library(dplyr)
nyc_df_18<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df_18 <- nyc_df_18 %>% filter(Year == 2018)

# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_18_top_10 <- nyc_df_18[order(-nyc_df_18$Total.Elevated.Blood.Levels), ]


top_10_counties <- head(nyc_df_18_top_10, 10)

library(ggplot2)

ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bottom 10 Counties (Not used just for reference)

nyc_df_18_bottom_10 <- nyc_df_18[order(nyc_df_18$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_18_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")

#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
  data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
  data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)

# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
  geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top") +
  scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
  scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'

# Plot the data points and the regression line
ggplot(nyc_df_18, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df_18)
# Plot the data points and the regression line
ggplot(nyc_df_18, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2018")
## `geom_smooth()` using formula = 'y ~ x'

# KMeans Clustering for EBLL and Income

set.seed(123) 
data_for_clustering <- nyc_df_18[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)


k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_18_copy<- nyc_df_18
nyc_df_18_copy$cluster <- as.factor(kmeans_model$cluster)



ggplot(nyc_df_18_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
  geom_point() +
  labs(title = "K-Means Clustering",
       x = "Number of Children with EBLLs",
       y = " Median Income ") +
  scale_color_discrete(name = "Cluster")

#rate of elev vs percent change 

# Children Tested vs Income and rate of elevation 
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )


ggplot(nyc_df_18, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme

# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

ggplot(nyc_df_18, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme +
  coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))

#2019 Analysis:

library(ggplot2)
library(corrplot)
library(magrittr)
library(dplyr)
nyc_df<- read.csv('/Users/nigel/Downloads/nyc_clean.csv',header = TRUE)
nyc_df <- nyc_df %>% filter(Year == 2019)

# Top 10 Counties with highest Percent of Children with Elevated Blood Levels
nyc_df_top_10 <- nyc_df[order(-nyc_df$Total.Elevated.Blood.Levels), ]


top_10_counties <- head(nyc_df_top_10, 10)

library(ggplot2)

ggplot(top_10_counties, aes(x = reorder(County, Total.Elevated.Blood.Levels), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::percent(Total.Elevated.Blood.Levels/sum(Total.Elevated.Blood.Levels))), position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 Counties with Highest Percent of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bottom 10 Counties (Not used just for reference)

nyc_df_bottom_10 <- nyc_df[order(nyc_df$Total.Elevated.Blood.Levels), ]
bottom_10_counties <- head(nyc_df_bottom_10, 10)
ggplot(bottom_10_counties, aes(x = reorder(County, desc(Total.Elevated.Blood.Levels)), y = Total.Elevated.Blood.Levels, fill = County)) +
  geom_bar(stat = "identity", width = 0.5) +
  labs(title = "Bottom 10 Counties with Lowest Number of Children with EBLLs", x = "County", y = "Number of Children with EBLLs") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_text(aes(label = Total.Elevated.Blood.Levels), vjust = -0.5, size = 3, color = "black")

#Median Income Comparison between Counties with Lowest and Highest EBLLs
income_county_data <- rbind(
  data.frame(Group = "Lowest EBLL", County = factor(bottom_10_counties$County, levels = unique(bottom_10_counties$County)), Per.Capita.Income = bottom_10_counties$Per.Capita.Income),
  data.frame(Group = "Highest EBLL", County = factor(top_10_counties$County, levels = unique(top_10_counties$County)), Per.Capita.Income = top_10_counties$Per.Capita.Income)
)

# Bar graph (Not Recommended)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, fill = Group)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Scatter Plot(Use this)
ggplot(income_county_data, aes(x = County, y = Per.Capita.Income, color = Group)) +
  geom_point(size = 4, alpha = 0.7, position = position_jitter(width = 0.3)) +
  geom_smooth(method = "lm", se = FALSE, linetype = "dashed", color = "black") +
  labs(title = "Median Income Comparison between Counties with Lowest and Highest EBLLs",
       x = "County", y = "Median Income") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "top") +
  scale_color_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759")) +
  scale_fill_manual(values = c("Lowest EBLL" = "#4e79a7", "Highest EBLL" = "#e15759"))
## `geom_smooth()` using formula = 'y ~ x'

# Plot the data points and the regression line
ggplot(nyc_df, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(Per.Capita.Income ~ Total.Elevated.Blood.Levels, data = nyc_df)
# Plot the data points and the regression line
ggplot(nyc_df, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Linear Regression",
       x = "Number of Children with EBLLs",
       y = "Income 2019")
## `geom_smooth()` using formula = 'y ~ x'

# KMeans Clustering for EBLL and Income

set.seed(123) 
data_for_clustering <- nyc_df[, c("Total.Elevated.Blood.Levels", "Per.Capita.Income")]
scaled_data <- scale(data_for_clustering)


k <- 3
kmeans_model <- kmeans(scaled_data, centers = k)
nyc_df_copy<- nyc_df
nyc_df_copy$cluster <- as.factor(kmeans_model$cluster)



ggplot(nyc_df_copy, aes(x = Total.Elevated.Blood.Levels, y = Per.Capita.Income, color = cluster)) +
  geom_point() +
  labs(title = "K-Means Clustering",
       x = "Number of Children with EBLLs",
       y = " Median Income ") +
  scale_color_discrete(name = "Cluster")

#rate of elev vs percent change 

# Children Tested vs Income and rate of elevation 
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )


ggplot(nyc_df, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme

# Children Tested vs Income Plot Zoomed in on Clustered Area(Axis Changed)
custom_theme <- theme_minimal() +
  theme(
    plot.title = element_text(size = 20, hjust = 0.5, face = "bold", color = "darkblue"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text = element_text(size = 14),
    legend.title = element_text(size = 14),
    legend.text = element_text(size = 12)
  )

ggplot(nyc_df, aes(x = Tests, y = Per.Capita.Income, size = Tests, color = Rate.per.1.000)) +
  geom_point(alpha = 0.7) +
  scale_color_gradient(low = "blue", high = "orange") +
  scale_size_continuous(range = c(3, 15)) +
  labs(title = "Children Tested vs Income",
       x = "Number of Children Tested",
       y = "Income",
       size = "Number of Children Tested",
       color = "Rate of Elevation in EBLLs") +
  custom_theme +
  coord_cartesian(xlim = c(0, 5000), ylim = c(26000, 85000))

 #install.packages("leaflet")

library(dplyr)
library(ggplot2)
library(corrplot)
#library(leaflet)

# Assuming your dataframe is named 'your_dataframe'
# Replace this with your actual dataframe
your_dataframe <- read.csv('/Users/nigel/Downloads/nyc_clean.csv')

# 1. Summary Statistics
summary(your_dataframe)
##     County           County.Code         Year          Tests       
##  Length:265         Min.   :  1.0   Min.   :2016   Min.   :   6.0  
##  Class :character   1st Qu.: 29.0   1st Qu.:2017   1st Qu.:  71.0  
##  Mode  :character   Median : 59.0   Median :2018   Median : 142.0  
##                     Mean   : 62.7   Mean   :2018   Mean   : 484.7  
##                     3rd Qu.: 97.0   3rd Qu.:2019   3rd Qu.: 411.0  
##                     Max.   :123.0   Max.   :2020   Max.   :6291.0  
##                                                                    
##  Less.than.5.mcg.dL  X5.10.mcg.dL    X10...15.mcg.dL    X15...mcg.dL   
##  Min.   :   6       Min.   :  0.00   Min.   :  0.000   Min.   : 0.000  
##  1st Qu.:  71       1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.: 0.000  
##  Median : 141       Median :  0.00   Median :  0.000   Median : 0.000  
##  Mean   : 465       Mean   : 14.53   Mean   :  2.691   Mean   : 2.525  
##  3rd Qu.: 411       3rd Qu.:  7.00   3rd Qu.:  0.000   3rd Qu.: 0.000  
##  Max.   :6273       Max.   :623.00   Max.   :105.000   Max.   :87.000  
##                                                                        
##  Total.Elevated.Blood.Levels    Percent        Rate.per.1.000  
##  Min.   :  0.00              Min.   :0.00000   Min.   :  0.00  
##  1st Qu.:  0.00              1st Qu.:0.00000   1st Qu.:  0.00  
##  Median :  0.00              Median :0.00000   Median :  0.00  
##  Mean   : 19.74              Mean   :0.01693   Mean   : 13.28  
##  3rd Qu.:  7.00              3rd Qu.:0.01800   3rd Qu.:  0.00  
##  Max.   :814.00              Max.   :0.24500   Max.   :508.00  
##                                                                
##  Per.Capita.Income Rank.in.State
##  Min.   : 34116    Min.   : 1   
##  1st Qu.: 41896    1st Qu.:14   
##  Median : 45120    Median :27   
##  Mean   : 49047    Mean   :27   
##  3rd Qu.: 51558    3rd Qu.:40   
##  Max.   :113477    Max.   :53   
##  NA's   :106       NA's   :106
ggplot(your_dataframe, aes(x = County, y = Tests, fill = County)) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Explore the time series data
ggplot(your_dataframe, aes(x = Year, y = Total.Elevated.Blood.Levels)) +
  geom_line() +
  labs(title = "Lead Levels Over Time")

# Assuming your dataset is named 'your_data'
# Replace 'your_data' with your actual dataset name

# Load necessary libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ tidyr::extract()   masks magrittr::extract()
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ purrr::set_names() masks magrittr::set_names()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
your_data <- read.csv('/Users/nigel/Downloads/nyc_clean.csv')

# Convert the data to a tsibble format
your_data_ts <- your_data %>%
  as_tsibble(key = County, index = Year) %>%
  arrange(County, Year)  # Make sure data is sorted by County and Year

# Explore the time series data (plot as kernel density plot)
ggplot(your_data_ts, aes(x = Total.Elevated.Blood.Levels)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Kernel Density of Lead Levels Over Time",
       x = "Lead Levels (mcg/dL)",
       y = "Density")+xlim(c(0,50))
## Warning: Removed 24 rows containing non-finite values (`stat_density()`).

#Start of Timeseries Code
library(tidyverse)
library(tsibble)
library(forecast)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
ts=read.csv('/Users/nigel/Downloads/nyc_clean.csv')
ts <- ts %>%
  filter(Year >= 2017 & Year <= 2019)


ts$Year<-as.factor(ts$Year)

# Top 10 EBLL Counties
top_counties <- ts %>%
  group_by(County) %>%
  summarise(total_eblls = sum(Total.Elevated.Blood.Levels)) %>%
  arrange(desc(total_eblls)) %>%
  slice(1:10) %>%
  pull(County)



# Filter Top 10 EBLL Counties
df_top_10 <- ts %>%
  filter(County %in% top_counties)

# Plotly plot for Top 10 based on EBLL and Year
plot_ly(df_top_10, x = ~Year, y = ~Total.Elevated.Blood.Levels, color = ~County, type = "scatter", mode = "lines") %>%
  layout(title = "Top 10 Counties with Highest levels of EBLL",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Number of Children with EBLLs"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Plotly plot for Top 10 Counties with highest EBLL based on Income and Year
plot_ly(df_top_10, x = ~Year, y = ~Per.Capita.Income, color = ~County, type = "scatter", mode = "lines") %>%
  layout(title = "Median Income of Top 10 Counties with highest EBLL",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Median Income"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Combining both
plot_ly() %>%
  add_trace(data = df_top_10, x = ~Year, y = ~Total.Elevated.Blood.Levels, color = ~County,
            type = "scatter", mode = "lines", name = "Number of Children with EBLLs",
            hoverinfo = "text", text = ~paste("County: ", County, "<br>Year: ", Year, "<br>EBLLs: ", Total.Elevated.Blood.Levels)) %>%
  add_trace(data = df_top_10, x = ~Year, y = ~Per.Capita.Income, color = ~County,
            type = "scatter", mode = "lines", name = "Median Income",
            yaxis = "y2", hoverinfo = "text", text = ~paste("County: ", County, "<br>Year: ", Year, "<br>Income: ", Per.Capita.Income)) %>%
  layout(title = "Top 10 Counties with Highest Levels of EBLL and their Median Income",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Number of Children with EBLLs", side = "left"),
         yaxis2 = list(title = "Median Income", side = "right", overlaying = "y", showgrid = FALSE),
         showlegend = FALSE,
         hovermode = "closest")
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Top 10 Elevation Rate Counties
top_elevation_counties <- ts %>%
  group_by(County) %>%
  summarise(total_elevation = sum(Rate.per.1.000)) %>%
  arrange(desc(total_elevation)) %>%
  slice(1:10)




# Filter Top 10 Elevation Rate Counties
df_top_10_elevation <- ts %>%
  filter(County %in% top_elevation_counties$County)

plot_ly(df_top_10_elevation, x = ~Year, y = ~Rate.per.1.000, color = ~County, type = "scatter", mode = "lines") %>%
  layout(title = "Top 10 Counties with highest Rate of Elevation",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Rate Of Elevation"))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
# Forecast Blood Levels in 2020
ts_subset <- ts[ts$Year %in% c(2017, 2018, 2019), ]
counties <- unique(ts_subset$County)

forecast_df <- data.frame(County = character(), Forecasted_EBLLs_2020 = numeric(), stringsAsFactors = FALSE)


for (county in counties) {
  county_data <- ts_subset$Total.Elevated.Blood.Levels[ts_subset$County == county]
  arima_model <- auto.arima(county_data)
  

  forecast_result <- forecast(arima_model, h = 1)
  forecast_value <- forecast_result$mean[1]
  
  forecast_df <- rbind(forecast_df, data.frame(County = county, Forecasted_EBLLs_2020 = forecast_value))
}


sorted_forecast_df <- forecast_df[order(-forecast_df$Forecasted_EBLLs_2020), ]
top_10_counties_forecast <- head(sorted_forecast_df, 10)


library(ggplot2)

# Create a bar plot for the top 10 counties forecast 
ggplot(top_10_counties_forecast, aes(x = reorder(County, -Forecasted_EBLLs_2020), y = Forecasted_EBLLs_2020, fill = County)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = scales::percent(Forecasted_EBLLs_2020/sum(Forecasted_EBLLs_2020))), position = position_stack(vjust = 0.5)) +
  labs(title = "Top 10 Counties with Highest Forecasted Blood Levels in 2020", x = "County", y = "Forecasted Blood Levels") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))